Load Libraries
library(DistributionUtils)
## Loading required package: RUnit
library(scales)
library(ggplot2)
library(reshape)
library(data.table)
##
## Attaching package: 'data.table'
##
## The following object is masked from 'package:reshape':
##
## melt
library(IRanges)
## Loading required package: BiocGenerics
## Loading required package: parallel
##
## Attaching package: 'BiocGenerics'
##
## The following objects are masked from 'package:parallel':
##
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
## clusterExport, clusterMap, parApply, parCapply, parLapply,
## parLapplyLB, parRapply, parSapply, parSapplyLB
##
## The following objects are masked from 'package:stats':
##
## IQR, mad, xtabs
##
## The following objects are masked from 'package:base':
##
## Filter, Find, Map, Position, Reduce, anyDuplicated, append,
## as.data.frame, as.vector, cbind, colnames, do.call,
## duplicated, eval, evalq, get, grep, grepl, intersect,
## is.unsorted, lapply, lengths, mapply, match, mget, order,
## paste, pmax, pmax.int, pmin, pmin.int, rank, rbind, rownames,
## sapply, setdiff, sort, table, tapply, union, unique, unlist,
## unsplit
##
## Loading required package: S4Vectors
## Loading required package: stats4
##
## Attaching package: 'S4Vectors'
##
## The following object is masked from 'package:reshape':
##
## rename
##
##
## Attaching package: 'IRanges'
##
## The following object is masked from 'package:data.table':
##
## shift
##
## The following object is masked from 'package:reshape':
##
## expand
library(GenomicRanges)
## Loading required package: GenomeInfoDb
wd = "/home/users/allstaff/tonkin-hill.g/assembly_testing/ComparisonForManuscript/"
Load input files
setwd(wd)
EIC_processed_contigs <- read.delim("/home/users/allstaff/tonkin-hill.g/assembly_testing/ComparisonForManuscript/SoapDeNovo_withHumanFilter/EICv2/analytics_output.txt")
ECS_processed_contigs <- read.delim("/home/users/allstaff/tonkin-hill.g/assembly_testing/ComparisonForManuscript/SoapDeNovo_withHumanFilter/ECS/analytics_output.txt")
CS2_processed_contigs <- read.delim("/home/users/allstaff/tonkin-hill.g/assembly_testing/ComparisonForManuscript/SoapDeNovo_withHumanFilter/CS2/analytics_output.txt")
setwd(wd)
plot_contigs <- function(blastData, length_filt, identity_filt, var_gene_lengths){
# blastData <-EIC_processed_contigs
# identity_filt <- 95
# length_filt <- 500
#first filter out unmatched contigs and convert types
blastData <- data.frame(subset(blastData, sequence.id != '-'))
blastData$alignment.length <-as.numeric(as.character(blastData$alignment.length))
blastData$s..start <-as.numeric(as.character(blastData$s..start))
blastData$s..end <-as.numeric(as.character(blastData$s..end))
blastData$q..start <-as.numeric(as.character(blastData$q..start))
blastData$q..end <-as.numeric(as.character(blastData$q..end))
blastData$percent.identity <-as.numeric(as.character(blastData$percent.identity))
#now filter out hits we aren't interested in
#blastData <- subset(blastData, alignment.length > length_filt) #throw away contigs less than 3 reads ~ length residues long
#blastData <- subset(blastData, alignment.length > 800) #throw away contigs less than 3 reads ~ length residues long
#blastData <- subset(blastData, percent.identity > identity_filt) #throw away alignments that share less than x%
#now split into seperate data frames for each database
var_hits <- subset(blastData, grepl("blastHits_Full_var_rask2010*",data.base.name))
var_hits <- subset(var_hits, alignment.length > length_filt) #throw away contigs less than 3 reads ~ length residues long
var_hits <- subset(var_hits, percent.identity > identity_filt) #throw away alignments that share less than x%
#var_hits <-subset(var_hits, (sequence.id == 'IT4var04') | (sequence.id == 'IT4var08'))
human_hits <- subset(blastData, grepl("blastHits_human_reference*",data.base.name))
Pf3D7_hits <- subset(blastData, grepl("blastHits_Pf3D7_reference*",data.base.name))
Pf3D7_hits <- subset(Pf3D7_hits, alignment.length > 100) #throw away contigs less than 3 reads ~ length residues long
Pf3D7_hits <- subset(Pf3D7_hits, percent.identity > 85) #throw away alignments that share less than x%
#filter out human and Pf3D7 contigs that aren't matched to any var genes
var_contigs <- unique(var_hits$contig.id)
human_hits <- subset(human_hits, contig.id %in% var_contigs)
Pf3D7_hits <- subset(Pf3D7_hits, contig.id %in% var_contigs)
#now annoying stuff so ggplot doesnt get upset (give each contig a different height?)
ydf <- data.frame(contig.id=unique(var_hits$contig.id) , y=as.numeric(factor(unique(var_hits$contig.id))))
var_hits <- merge(var_hits, ydf, by="contig.id")
human_hits <- merge(human_hits, ydf, by="contig.id")
Pf3D7_hits <- merge(Pf3D7_hits, ydf, by="contig.id")
#Now generate the underlying contigs (full length)
contig <- var_hits
contig$s..start <- contig$s..start - contig$q..start
contig$s..end <- contig$s..start+contig$contig.length
#now change the Pf3D7 sequence to be in terms of the var reference VERY HACKY
diff <- data.frame(contig.id=contig$contig.id, diff=contig$s..start, diff_seq=contig$sequence.id)
diff <- subset(diff, !duplicated(contig.id))
diff <- merge(diff, Pf3D7_hits, by="contig.id")
Pf3D7_hits$s..start <- diff$diff + Pf3D7_hits$q..start
Pf3D7_hits$s..end <- diff$diff + Pf3D7_hits$q..end
Pf3D7_hits$sequence.id <- diff$diff_seq
#get the gene lengths from rask data
var_gene_lengths <- var_gene_lengths[var_gene_lengths$sequence.id %in% intersect(var_gene_lengths$sequence.id, contig$sequence.id),]
gg <- ggplot(data=contig, aes(xmin = s..start, xmax = s..end, ymin = y, ymax = y+1, fill=factor(contig.id))) + geom_rect( alpha=0.4)
gg <- gg + geom_rect(data=var_hits, aes(xmin = s..start, xmax = s..end, ymin = y, ymax = y+1, fill=factor(contig.id), labels = rpk))
#gg <- gg + geom_rect(data=Pf3D7_hits, aes(xmin = s..start, xmax = s..end, ymin = y, ymax = y+1), color="black", alpha=0.5)
gg <- gg + geom_text(data=var_hits, aes(x=s..start+(s..end-s..start)/2, y=y+(y+1-y)/2, label=paste(contig.id,rpk, sep=" ")), size=8)
gg <- gg + geom_vline(data = var_gene_lengths, aes(xintercept = gene.length))
gg <- gg + facet_wrap(~sequence.id)
gg <- gg + xlab("location on reference sequence")
gg <- gg + ylab("") + scale_y_continuous(breaks=NULL)
gg <- gg + mytheme()
print(gg)
#paste(percent.identity,mismatches,gap.opens, sep=" ")
rpk_counts <- var_hits
rpk_counts$rpk <- rpk_counts$rpk*(rpk_counts$alignment.length/rpk_counts$contig.length)
rpk_counts <- rpk_counts[,(names(rpk_counts) %in% c('sequence.id','rpk'))]
rpk_counts <- aggregate(. ~ sequence.id, data=rpk_counts, sum)
names(rpk_counts) <- c("var.name", "contig.count")
return(rpk_counts)
}
Function to calculate redundancy
setwd(wd)
redundancy <- function(blastData, length_filt, var_gene_lengths, identity_filt=97){
# blastData <-CS2_processed_contigs
#first filter out unmatched contigs and convert types
blastData <- data.frame(subset(blastData, sequence.id != '-'))
blastData$alignment.length <- as.numeric(as.character(blastData$alignment.length))
blastData$s..start <- as.numeric(as.character(blastData$s..start))
blastData$s..end <- as.numeric(as.character(blastData$s..end))
blastData$q..start <- as.numeric(as.character(blastData$q..start))
blastData$q..end <- as.numeric(as.character(blastData$q..end))
blastData$percent.identity <- as.numeric(as.character(blastData$percent.identity))
blastData$alignment.length <- as.numeric(as.character(blastData$alignment.length))
blastData$bit.score <- as.numeric(as.character(blastData$bit.score))
#now split into seperate data frames for each database
var_hits <- subset(blastData, grepl("blastHits_Full_var_rask2010*",data.base.name))
var_hits <- subset(var_hits, alignment.length > length_filt) #throw away contigs less than 3 reads ~ length residues long
var_hits <- subset(var_hits, percent.identity > identity_filt) #throw away alignments that share less than x%
#var_hits <-subset(var_hits, (sequence.id == 'IT4var04') | (sequence.id == 'IT4var08'))
human_hits <- subset(blastData, grepl("blastHits_human_reference*",data.base.name))
Pf3D7_hits <- subset(blastData, grepl("blastHits_Pf3D7_reference*",data.base.name))
Pf3D7_hits <- subset(Pf3D7_hits, alignment.length > 100) #throw away contigs less than 3 reads ~ length residues long
Pf3D7_hits <- subset(Pf3D7_hits, percent.identity > 85) #throw away alignments that share less than x%
#filter out human and Pf3D7 contigs that aren't matched to any var genes
var_contigs <- unique(var_hits$contig.id)
human_hits <- subset(human_hits, contig.id %in% var_contigs)
Pf3D7_hits <- subset(Pf3D7_hits, contig.id %in% var_contigs)
#sort var hit by name then bit score
var_hits <- var_hits[with(var_hits, order(contig.id, bit.score, decreasing=TRUE)), ]
var_hits <- droplevels(var_hits)
#remove duplicated contigs which have overlapping query regions.
VH <- as.data.table(var_hits)
VH$q..start <- VH$q..start+250
VH$q..end <- VH$q..end-250
VH[,group := {
ir <- IRanges(q..start, q..end);
subjectHits(findOverlaps(ir, reduce(ir), minoverlap=0))
},by=contig.id]
VH$q..start <- VH$q..start-250
VH$q..end <- VH$q..end+250
VH <- data.frame(VH)
VH <- VH[!duplicated(VH[,colnames(VH) %in% c('contig.id', 'group')]),]
VH$group <- NULL
total.alignment <- sum(VH$alignment.length)
gr = GRanges(VH$sequence.id,
IRanges(VH$s..start, VH$s..end),
strand="*")
re <- reduce(gr)
total.coverage <- sum(width(ranges(re)))
redundancy <- total.alignment/total.coverage
return(redundancy)
}
Load more data
setwd(wd)
#expression data
var_expressions_mike <- read.csv("/wehisan/home/allstaff/t/tonkin-hill.g/assembly_testing/counts/var_expressions_mike.csv")
#read count data
read_counts_against_IT4_genes <- read.csv("/wehisan/home/allstaff/t/tonkin-hill.g/assembly_testing/counts/read_counts_against_IT4_genes.csv")
CS2_read_counts <- data.frame('var name'=read_counts_against_IT4_genes$CS2, 'counts'=read_counts_against_IT4_genes$counts)
ECS_read_counts <- data.frame('var name'=read_counts_against_IT4_genes$ECS, 'counts'=read_counts_against_IT4_genes$counts.1)
EIC_read_counts <- data.frame('var name'=read_counts_against_IT4_genes$EIC, 'counts'=read_counts_against_IT4_genes$counts.2)
#gene length data
var_gene_lengths <- read.delim("/wehisan/home/allstaff/t/tonkin-hill.g/find_var_genes/assemble_var/data/var_rask_lengths.txt", header=F)
colnames(var_gene_lengths) <- c("sequence.id", "gene.length")
var_gene_lengths <- data.frame(var_gene_lengths)
#rpkm
CS2_read_counts <- merge(CS2_read_counts, var_gene_lengths, by.x="var.name", by.y="sequence.id")
CS2_read_counts$counts <- CS2_read_counts$counts/CS2_read_counts$gene.length
CS2_read_counts$gene.length <- NULL
ECS_read_counts <- merge(ECS_read_counts, var_gene_lengths, by.x="var.name", by.y="sequence.id")
ECS_read_counts$counts <- ECS_read_counts$counts/ECS_read_counts$gene.length
ECS_read_counts$gene.length <- NULL
EIC_read_counts <- merge(EIC_read_counts, var_gene_lengths, by.x="var.name", by.y="sequence.id")
EIC_read_counts$counts <- EIC_read_counts$counts/EIC_read_counts$gene.length
EIC_read_counts$gene.length <- NULL
# #more rpkm
# EIC_processed_contigs <- merge(EIC_processed_contigs, var_gene_lengths, by.x="sequence.id", by.y="sequence.id")
# EIC_processed_contigs$rpk <- EIC_processed_contigs$rpk/EIC_processed_contigs$gene.length
# EIC_processed_contigs$gene.length <- NULL
For the contig plots we restrict the contigs to those that are longer than 1000 residues and share more than 95% similarity with the reference. Any contig that has overlapped with human, falciparum or vivax (non-var) by more than 30% has also been removed. In each of the contig plots below the lighter shaded blocks represent the contigs whilst the dark colouring represents those parts of the contigs that have aligned to the reference. If contigs map to more than one reference gene they can appear more than once.
Lets look at a plot of the contigs for ECS.
setwd(wd)
contig_counts <- plot_contigs(ECS_processed_contigs, 800, 95, var_gene_lengths)
Next we look at the redundancy of this assembly. Redundancy is calculated by first filtering contigs to those that align to VAR with a length of at least 800 and identity of 97. We then only allow each region of each transcript to only map to one reference region. This is done rather harshly in that if two blast hit overlap from on the same region of the transcript we take the one with the high bit score. Reduncdancy is then calculated as \[ \frac{\text{total alignment length}}{\text{total number of nucleotide in reference that are covered by hits}} \]
setwd(wd)
redundancy(ECS_processed_contigs, 800, var_gene_lengths, 95)
## [1] 1
For the expression plots I first aligned all the reads for each sample onto all the IT4var genes. This is reffered to as ‘whole gene count’. I then aligned all the reads for each sample onto the contigs output by our assembly pipeline. This is reffered to as ‘contig count’. Finally expression count are the 2exp-deltadeltaCt numbers from the quantification assay. All these counts are normalised and then plotted on a log10 scale. Thus each point on the y-axis can be interpreted as 0.1% of the total for that count type and that sample.
plot expression for ECS
setwd(wd)
ecs_df <- merge(var_expressions_mike, ECS_read_counts, by='var.name')
ecs_df <- ecs_df[,!(names(ecs_df) %in% c('CS2','EIC'))]
ecs_df$counts <- ecs_df$counts/sum(ecs_df$counts)*1000000
ecs_df$ECS <- ecs_df$ECS
ecs_df <- merge(ecs_df, contig_counts, all.x=TRUE, by='var.name')
ecs_df$contig.count[is.na(ecs_df$contig.count)] <- 0
ecs_df$contig.count <- ecs_df$contig.count/sum(ecs_df$contig.count)*1000000
names(ecs_df) <- c('Rask_name','Expression_name','qPCR expression value','whole gene RPKM','contig RPKM')
ecs_df2 <- melt(ecs_df, id=c('Rask_name','Expression_name'))
ecs <- ggplot(ecs_df2, aes(Rask_name, fill=factor(variable), weight=value)) + geom_bar(position="dodge")
ecs <- ecs + facet_wrap(~ variable, scales="free_y", ncol = 1)
ecs <- ecs + mybartheme()
ecs <- ecs + ylab("Expression profile (RPKM/qPCR)") + xlab("Gene Name")
ecs
Lets look at a plot of the contigs for CS2
setwd(wd)
contig_counts <- plot_contigs(CS2_processed_contigs, 800, 95, var_gene_lengths)
Redundancy
setwd(wd)
redundancy(CS2_processed_contigs, 800, var_gene_lengths, 95)
## [1] 1
plot expression for CS2
setwd(wd)
cs2_df <- merge(var_expressions_mike, CS2_read_counts, by='var.name')
cs2_df <- cs2_df[,!(names(cs2_df) %in% c('ECS','EIC'))]
cs2_df$counts <- cs2_df$counts/sum(cs2_df$counts)*1000000
cs2_df$CS2 <- cs2_df$CS2
cs2_df <- merge(cs2_df, contig_counts, all.x=TRUE, by='var.name')
cs2_df$contig.count[is.na(cs2_df$contig.count)] <- 0
cs2_df$contig.count <- cs2_df$contig.count/sum(cs2_df$contig.count)*1000000
names(cs2_df) <- c('Rask_name','Expression_name','qPCR expression value','whole gene RPKM','contig RPKM')
cs2_df2 <- melt(cs2_df, id=c('Rask_name','Expression_name'))
cs2 <- ggplot(cs2_df2, aes(Rask_name, fill=factor(variable), weight=value)) + geom_bar(position="dodge")
cs2 <- cs2 + scale_y_continuous(limits=c(0,NA))
cs2 <- cs2 + facet_wrap(~ variable, scales="free_y", ncol = 1)
cs2 <- cs2 + mybartheme()
cs2 <- cs2 + ylab("Expression profile (RPKM/qPCR)") + xlab("Gene Name")
cs2
Lets look at a plot of the contigs for EIC As their was a higher rate of similarities in EIC I’ve upped the similarity threshold to 97%
setwd(wd)
contig_counts <- plot_contigs(EIC_processed_contigs, 500, 95, var_gene_lengths)
Redundancy
setwd(wd)
redundancy(EIC_processed_contigs, 500, var_gene_lengths, 95)
## [1] 1.004301
plot expression for EIC
setwd(wd)
eic_df <- merge(var_expressions_mike, EIC_read_counts, by='var.name')
eic_df <- eic_df[,!(names(eic_df) %in% c('ECS','CS2'))]
eic_df$counts <- eic_df$counts/sum(eic_df$counts)*1000000
eic_df$EIC <- eic_df$EIC
eic_df <- merge(eic_df, contig_counts, all.x=TRUE, by='var.name')
eic_df$contig.count[is.na(eic_df$contig.count)] <- 0
eic_df$contig.count <- eic_df$contig.count/sum(eic_df$contig.count)*1000000
names(eic_df) <- c('Rask_name','Expression_name','qPCR expression value','whole gene RPKM','contig RPKM')
eic_df2 <- melt(eic_df, id=c('Rask_name','Expression_name'))
eic <- ggplot(eic_df2, aes(Rask_name, fill=factor(variable), weight=value)) + geom_bar(position="dodge")
eic <- eic + scale_y_continuous(limits=c(0,NA))
eic <- eic + mybartheme()
eic <- eic + facet_wrap(~ variable, scales="free_y", ncol = 1)
eic <- eic + ylab("Expression profile (RPKM/qPCR)") + xlab("Gene Name")
eic
Now we calculate some summary statistics for this comparison. First the number of IT4var genes found and the total found by qPCR above 1e-3.
setwd(wd)
countQPCRExpressed <- sum(eic_df$'qPCR expression value'>1e-3)
countContigExpressed <- sum(eic_df$'contig RPKM'>0)
A total of 43 were found above a threshold of \(1e-3\) by qPCR. After assembly a total of 40 genes were identified.
Now we look at the overall similarity of coverage. This is rather sensitive to single genes with large differences in exression profiles between the methods.
setwd(wd)
qPCR <- eic_df$'qPCR expression value'
qPCR[qPCR<1e-3] <- 0
contig <- eic_df$'contig RPKM'
The Spearman correlation in expression profiles is 0.6538091
The Pearson correlation in expression is 0.8044045
warnings()
## NULL